initialize_values <- function(data, n, d, K, seed = 123) {
  # Initialize Gaussian-mixture parameters for EM.
  #
  # Args:
  #   data: numeric matrix (n x d) of observations.
  #   n:    number of rows in `data`.
  #   d:    number of columns (unused here; kept for interface compatibility).
  #   K:    number of mixture components.
  #   seed: RNG seed for the random mean selection. Default 123 preserves the
  #         previously hard-coded behavior while letting callers vary the init.
  #
  # Returns: list with
  #   Pis:    length-K mixing proportions (all equal),
  #   Mus:    K x d matrix of initial means (K rows sampled from `data`),
  #   Sigmas: list of K covariance matrices, all equal to cov(data).
  set.seed(seed) # Ensure reproducibility
  # Initialize mixing proportions equally across components
  Pis <- rep(1 / K, K)
  # Initialize means with K randomly chosen data points
  Mus <- data[sample(1:n, K), , drop = FALSE]
  # Initialize every covariance with the overall covariance of the data
  # (computed once, shared by all K components)
  data_cov <- cov(data)
  Sigmas <- lapply(seq_len(K), function(i) data_cov)
  list(Pis = Pis, Mus = Mus, Sigmas = Sigmas)
}
convert_to_rgb <- function(k, segmented_img) {
  # Map a matrix of cluster labels to an RGB image.
  #
  # Args:
  #   k:             number of clusters (labels assumed to be in 1..k).
  #   segmented_img: matrix of cluster labels (height x width).
  #
  # Returns: height x width x 3 numeric array with channel values in [0, 1].
  #
  # Fix: the original body read the GLOBAL `K` and shadowed the `k` parameter
  # with the loop index; we now use the parameter and a distinct loop name.
  my_colors <- rainbow(k)
  height <- nrow(segmented_img)
  width <- ncol(segmented_img)
  # Empty 3D array for the colored image
  segmented_rgb <- array(0, dim = c(height, width, 3))
  # Fill in the array by mapping each cluster label to its color
  for (cl in seq_len(k)) {
    # (R, G, B) components of the cl-th color, scaled to [0, 1]
    rgb_vals <- col2rgb(my_colors[cl]) / 255
    # Where the segmentation == cl, assign the color
    mask <- (segmented_img == cl)
    segmented_rgb[, , 1][mask] <- rgb_vals[1] # R channel
    segmented_rgb[, , 2][mask] <- rgb_vals[2] # G channel
    segmented_rgb[, , 3][mask] <- rgb_vals[3] # B channel
  }
  return(segmented_rgb)
}
library(ggplot2)
library(gridExtra) # grid.arrange — used by the evolution-grid plots below
library(mvtnorm)
generate_gaussian_mixture_data <- function(n, d, K, seed=42) {
  # Draw n points in d dimensions from a K-component Gaussian mixture with
  # equal mixing weights, returning the sample together with the latent
  # component of each point and the parameters that generated it.
  #
  # Args:
  #   n:    number of observations.
  #   d:    dimensionality of each observation.
  #   K:    number of Gaussian components.
  #   seed: RNG seed (default 42).
  #
  # Returns: list(data = n x d matrix, z = length-n component labels,
  #               Mu = list of K mean vectors, Sigma = list of K covariance
  #               matrices, Pis = length-K mixing proportions).
  set.seed(seed)
  # Equal mixing proportions
  Pis <- rep(1 / K, K)
  # Component means: uniform draws in [0, 10]^d
  Mu <- lapply(1:K, function(i) runif(d, min = 0, max = 10))
  # Component covariances: A'A is symmetric positive semi-definite by construction
  Sigma <- lapply(1:K, function(i) {
    A <- matrix(runif(d * d, min = 0.5, max = 3), d, d)
    return(t(A) %*% A)
  })
  # Allocate the sample, then draw a latent component per observation
  data <- matrix(0, n, d)
  z <- sample(1:K, n, replace = TRUE, prob = Pis)
  # Draw each observation from its component's Gaussian
  for (i in 1:n) {
    data[i, ] <- rmvnorm(1, Mu[[z[i]]], Sigma[[z[i]]])
  }
  return(list(data = data, z = z, Mu = Mu, Sigma = Sigma, Pis = Pis))
}
# Example usage ----
# Generate one mixture sample and visualize it in a way appropriate for d.
d <- 2   # Dimensionality: number of values in each observation
K <- 2   # Number of Gaussian components
n <- 300 # Number of data points
data_info <- generate_gaussian_mixture_data(n, d, K, seed=2)
data <- data_info$data
Mus <- data_info$Mu
Sigmas <- data_info$Sigma
Pis <- data_info$Pis
z <- data_info$z
if (d == 1) {
  # 1D: overlaid densities, one per true component
  to.plot <- data.frame(x = data[, 1], class = as.factor(z))
  ggplot(to.plot) + aes(x, fill = class) + geom_density(alpha = 0.5) +
    labs(title="1D Gaussian Mixture Density", x="X", y="Density") +
    theme_minimal()
}
if (d == 2) {
  # 2D: scatter plus density contours, colored by true component
  to.plot <- data.frame(x = data[, 1], y = data[, 2], class = as.factor(z))
  ggplot(to.plot) + aes(x, y, color = class) + geom_point() + geom_density_2d()
}
if (d == 3) {
  # 3D: static scatter via scatterplot3d
  library(scatterplot3d)
  scatterplot3d(data, color = as.numeric(z), pch = 19,
                main = "3D Gaussian Mixture",
                xlab = "X", ylab = "Y", zlab = "Z")
}
compute_gamma <- function(pi, phi) {
  # E-step responsibilities: gamma[i, k] proportional to pi[k] * phi[i, k].
  #
  # Args:
  #   pi:  length-K vector of mixing proportions.
  #   phi: n x K matrix; phi[i, k] = density of point i under component k.
  #
  # Returns: n x K matrix of responsibilities; each row sums to 1.
  #
  # Fix: the original `phi * pi` recycled the length-K `pi` down the rows in
  # column-major order, so it weighted POINTS instead of COMPONENTS whenever
  # the proportions were not all equal. Scale each column k by pi[k] instead.
  weighted <- sweep(phi, 2, pi, "*")
  # Dividing an n x K matrix by the length-n row sums recycles down columns,
  # which normalizes each row — correct here.
  weighted / rowSums(weighted)
}
compute_l <- function(pi, gamma, phi) {
  # Observed-data log-likelihood of the K-component mixture:
  #   sum_i log( sum_k pi[k] * phi[i, k] ).
  #
  # Args:
  #   pi:    length-K mixing proportions.
  #   gamma: responsibilities (unused; kept for interface compatibility).
  #   phi:   n x K matrix of per-component densities.
  #
  # Returns: scalar log-likelihood.
  #
  # Fix: the original `phi * pi` recycled `pi` down the rows (column-major);
  # weight each COLUMN k by pi[k] instead.
  sum(log(rowSums(sweep(phi, 2, pi, "*"))))
}
EM_algorithm <- function(data, pi, mu, sigma, tol = 10^-2, verbose=FALSE) {
# Fit a K-component Gaussian mixture to `data` by Expectation-Maximization.
#
# Args:
#   data:    n x d numeric matrix of observations.
#   pi:      length-K vector of initial mixing proportions.
#   mu:      K x d matrix of initial means (row k = mean of component k).
#   sigma:   list of K initial d x d covariance matrices.
#   tol:     stop once the absolute change in log-likelihood drops below tol.
#   verbose: if TRUE, print the log-likelihood at every iteration.
#
# Returns: list(n_iterations, pi, mu, sigma, gamma) with the final estimates
# and the n x K responsibility matrix gamma.
K <- length(pi) # Number of Gaussian components
n <- nrow(data) # Number of data points
d <- ncol(data) # Number of dimensions
# phi[i, k] = N(data[i, ] | mu[k, ], sigma[[k]]) for every point/component pair
phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
gamma <- compute_gamma(pi, phi) # Compute probs of belonging
L_temp <- compute_l(pi, gamma, phi) # Compute initial log-likelihood
if(verbose){
cat("Initial log-likelihood: ", L_temp, "\n")
}
# L holds the previous iteration's log-likelihood; initializing it to 0
# guarantees at least one pass of the loop for any reasonable L_temp.
L <- 0
iteration_counter <- 0
while (abs(L - L_temp) >= tol) {
L <- L_temp
## Expectation step ##
gamma <- compute_gamma(pi, phi)
## Maximization step ##
Nk <- colSums(gamma) # Effective number of points assigned to each Gaussian
pi <- Nk / n # Updated mixing proportion per component
# Update means: responsibility-weighted average of the data. Note the K x d
# matrix divided by the length-K Nk recycles down columns, i.e. row k is
# divided by Nk[k] — correct here.
mu <- t(gamma) %*% data / Nk
# Update covariance matrices (responsibility-weighted scatter around mu[k, ])
sigma <- lapply(1:K, function(k) {
X_centered <- sweep(data, 2, mu[k,]) # Center data at the new mean
sigma_k <- (t(X_centered) %*% (X_centered * gamma[, k])) / Nk[k]
# --- REGULARIZE the covariance to prevent singularities ---
epsilon <- 1e-6
sigma_k <- sigma_k + diag(epsilon, ncol(data))
return(sigma_k)
})
# Recompute the densities under the updated parameters
phi <- sapply(1:K, function(k) dmvnorm(data, mu[k,], sigma[[k]]))
# NOTE(review): `gamma` here is the pre-M-step value, but compute_l ignores
# its gamma argument, so the likelihood reflects the new pi and phi.
L_temp <- compute_l(pi, gamma, phi)
iteration_counter <- iteration_counter + 1
if(verbose){
cat("Iteration: ", iteration_counter, " | Current log-likelihood: ", L_temp, "\n")
}
}
return(list(n_iterations = iteration_counter, pi = pi, mu = mu, sigma = sigma, gamma = gamma))
}
# --- Run EM on the example data generated above, using the seeded init ---
init_values <- initialize_values(data, n, d, K)
results <- EM_algorithm(data, init_values$Pis, init_values$Mus, init_values$Sigmas, tol = 10^-3)
cat("EM converged in", results$n_iterations, "iterations.\n")
## EM converged in 12 iterations.
library(mvtnorm)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ellipse)
## Warning: package 'ellipse' was built under R version 4.4.3
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
# --- Modified EM_algorithm (Storing History for the First 8 Iterations) ---
compute_gamma <- function(pi, phi) {
  # E-step responsibilities: gamma[i, k] proportional to pi[k] * phi[i, k];
  # each row is normalized to sum to 1.
  #
  # Args:
  #   pi:  length-K vector of mixing proportions.
  #   phi: n x K matrix of per-component densities.
  #
  # Fix: the original `phi * pi` recycled the length-K `pi` down the rows
  # (column-major), weighting points rather than components whenever the
  # proportions differ. Scale each COLUMN k by pi[k] instead.
  weighted <- sweep(phi, 2, pi, "*")
  weighted / rowSums(weighted)
}
compute_l <- function(pi, gamma, phi) {
  # Observed-data log-likelihood: sum_i log( sum_k pi[k] * phi[i, k] ).
  # `gamma` is unused but kept for interface compatibility.
  #
  # Fix: the original `phi * pi` recycled `pi` down the rows; weight each
  # column k of phi by pi[k] instead.
  sum(log(rowSums(sweep(phi, 2, pi, "*"))))
}
EM_algorithm <- function(data, pi, mu, sigma, tol = 1e-2, verbose = FALSE) {
  # EM for a K-component Gaussian mixture, additionally recording the
  # responsibilities, means and covariances of the first 8 iterations so the
  # fitting process can be visualized afterwards.
  #
  # Args:
  #   data:    n x d matrix of observations.
  #   pi:      length-K initial mixing proportions.
  #   mu:      K x d matrix of initial means.
  #   sigma:   list of K initial d x d covariance matrices.
  #   tol:     stop once the absolute log-likelihood change drops below this.
  #   verbose: print the log-likelihood at each iteration.
  #
  # Returns: list(n_iterations, pi, mu, sigma, gamma,
  #               gamma_history, mu_history, sigma_history).
  n_comp <- length(pi)
  n_obs <- nrow(data)
  n_dim <- ncol(data)
  # Per-point, per-component Gaussian densities for given parameters
  densities <- function(means, covs) {
    sapply(seq_len(n_comp), function(k) dmvnorm(data, means[k, ], covs[[k]]))
  }
  phi <- densities(mu, sigma)
  gamma <- compute_gamma(pi, phi)
  ll_new <- compute_l(pi, gamma, phi)
  if (verbose) {
    cat("Initial log-likelihood: ", ll_new, "\n")
  }
  ll_old <- 0
  iter <- 0
  # Histories captured for the first 8 iterations only
  hist_gamma <- list()
  hist_mu <- list()
  hist_sigma <- list()
  while (abs(ll_old - ll_new) >= tol) {
    ll_old <- ll_new
    ## E-step: responsibilities under the current parameters
    gamma <- compute_gamma(pi, phi)
    # Snapshot the state entering this iteration
    if (iter < 8) {
      hist_gamma[[iter + 1]] <- gamma
      hist_mu[[iter + 1]] <- mu
      hist_sigma[[iter + 1]] <- sigma
    }
    ## M-step: re-estimate pi, mu, sigma from the responsibilities
    Nk <- colSums(gamma)
    pi <- Nk / n_obs
    mu <- t(gamma) %*% data / Nk
    sigma <- lapply(seq_len(n_comp), function(k) {
      centered <- sweep(data, 2, mu[k, ])
      cov_k <- (t(centered) %*% (centered * gamma[, k])) / Nk[k]
      # Small ridge keeps the covariance invertible
      cov_k + diag(1e-6, ncol(data))
    })
    phi <- densities(mu, sigma)
    ll_new <- compute_l(pi, gamma, phi)
    iter <- iter + 1
    if (verbose) {
      cat("Iteration: ", iter, " | Current log-likelihood: ", ll_new, "\n")
    }
  }
  return(list(n_iterations = iter, pi = pi, mu = mu, sigma = sigma,
              gamma = gamma, gamma_history = hist_gamma, mu_history = hist_mu,
              sigma_history = hist_sigma))
}
# --- 1D Experiment: generate data, run EM, compare assignments --- #
# 1) Generate 1D data (K=2, n=300, etc.)
d <- 1
K <- 2
n <- 300
set.seed(563)
data_info_1d <- generate_gaussian_mixture_data(n, d, K)
data_1d <- data_info_1d$data # shape: (n x 1)
true_labels_1d <- data_info_1d$z
# Quick look at the generated data
df_1d <- data.frame(
x = data_1d[,1],
cluster = factor(true_labels_1d)
)
# Plot the histogram of the generated data (colored by the true cluster)
ggplot(df_1d, aes(x, fill=cluster)) +
geom_histogram(alpha=0.5, position="identity", bins=30) +
theme_minimal() +
ggtitle("Generated 1D Gaussian Mixture")
# 2) Initialize parameters for EM (seeded inside initialize_values)
init_1d <- initialize_values(data_1d, nrow(data_1d), d, K)
# 3) Run EM algorithm with logging
res_1d <- EM_algorithm(
data = data_1d,
pi = init_1d$Pis,
mu = init_1d$Mus,
sigma = init_1d$Sigmas,
tol = 1e-4
)
cat("1D EM converged in", res_1d$n_iterations, "iterations.\n")
## 1D EM converged in 55 iterations.
# 5) Assign each data point to the most probable cluster (hard assignment)
gamma_1d <- res_1d$gamma
assigned_1d <- apply(gamma_1d, 1, which.max)
# EM cluster ids are arbitrary: compare against the true labels via a
# confusion matrix and flip the ids if the swapped labeling matches better.
# NOTE(review): assumes `cm` is 2x2, i.e. both clusters received at least one
# point — verify before reusing with other K or a degenerate fit.
cm <- table(true_labels_1d, assigned_1d)
if (cm[1,1] + cm[2,2] < cm[1,2] + cm[2,1]) {
assigned_1d <- ifelse(assigned_1d == 1, 2, 1)
}
df_1d$assigned <- factor(assigned_1d)
# 6) Visualize final clusters
ggplot(df_1d, aes(x, fill=assigned)) +
geom_histogram(alpha=0.5, position="identity", bins=30) +
theme_minimal() +
ggtitle("1D Gaussian Mixture - EM assigned clusters")
K = 2
# NOTE(review): the K assignment above is redundant — K is set again below.
# --- 2D Data Generation and Final Clustering Visualization --- #
# 1) Generate 2D data
d <- 2
K <- 2
n <- 300
set.seed(563)
data_info_2d <- generate_gaussian_mixture_data(n, d, K)
data_2d <- data_info_2d$data # shape: (n x 2)
true_labels_2d <- data_info_2d$z
df_2d <- data.frame(
x = data_2d[,1],
y = data_2d[,2],
cluster = factor(true_labels_2d)
)
# Plot the generated data (before EM), colored by true component
ggplot(df_2d, aes(x, y, color = cluster)) +
geom_point() +
geom_density_2d() +
theme_minimal() +
ggtitle("Generated 2D Gaussian Mixture (True Labels)")
# 2) Initialize parameters
init_2d <- initialize_values(data_2d, nrow(data_2d), d, K)
# 3) Run EM Algorithm (storing history for first 8 iterations)
res_2d <- EM_algorithm(data_2d, init_2d$Pis, init_2d$Mus, init_2d$Sigmas, tol = 1e-4)
cat("2D EM converged in", res_2d$n_iterations, "iterations.\n")
## 2D EM converged in 8 iterations.
# 4) Final cluster assignment (most probable component per point)
gamma_2d <- res_2d$gamma
assigned_2d <- apply(gamma_2d, 1, which.max)
# Create confusion matrix to re-label if necessary — EM cluster ids are
# arbitrary, so swap them when the flipped labeling agrees better.
# NOTE(review): assumes the table is 2x2 (both clusters non-empty).
cm <- table(true_labels_2d, assigned_2d)
if (cm[1,1] + cm[2,2] < cm[1,2] + cm[2,1]) {
assigned_2d <- ifelse(assigned_2d == 1, 2, 1)
}
df_2d$assigned <- factor(assigned_2d)
# Plot the final clustering (after EM)
ggplot(df_2d, aes(x, y, color = assigned)) +
geom_point() +
theme_minimal() +
ggtitle("2D Gaussian Mixture - EM Assigned Clusters")
# --- 5a) Visualization: Gamma Evolution (During EM) --- #
# One panel per stored EM iteration, coloring each point by its current
# probability of belonging to cluster 1.
plots_gamma <- list()
# seq_along (not 1:length) so an empty history yields zero iterations
for (i in seq_along(res_2d$gamma_history)) {
  gamma_iter <- res_2d$gamma_history[[i]]
  df_gamma <- data.frame(x = data_2d[, 1], y = data_2d[, 2], prob_cluster1 = gamma_iter[, 1])
  p_gamma <- ggplot(df_gamma, aes(x = x, y = y, color = prob_cluster1)) +
    geom_point() +
    scale_color_gradient(low = "blue", high = "red") +
    ggtitle(paste("Gamma Evolution - Iteration", i)) +
    theme_minimal()
  plots_gamma[[i]] <- p_gamma
}
# Arrange all iteration panels in one grid
# (grid.arrange comes from gridExtra, loaded at the top of the script)
grid.arrange(grobs = plots_gamma, ncol = 4, top = "Gamma Evolution (Probability for Cluster 1)")
# --- 5b) Visualization: Parameter Evolution (During EM) --- #
# One panel per stored EM iteration: data in grey, plus each component's
# estimated mean and 95% covariance ellipse.
plots_params <- list()
for (i in seq_along(res_2d$mu_history)) {
  current_mu <- res_2d$mu_history[[i]]
  current_sigma <- res_2d$sigma_history[[i]]
  p_param <- ggplot(df_2d, aes(x = x, y = y)) +
    geom_point(alpha = 0.5, color = "gray") +
    ggtitle(paste("Parameter Evolution - Iteration", i)) +
    theme_minimal()
  for (k in 1:K) {
    cluster_color <- ifelse(k == 1, "red", "blue")
    ellipse_points <- as.data.frame(ellipse(current_sigma[[k]], centre = current_mu[k,], level = 0.95))
    # Fix: the original put `current_mu[k, 1]` inside aes(), which ggplot2
    # evaluates lazily when the plot is PRINTED — by then the loop variables
    # hold their last values, so every mean marker was drawn at the final
    # k/iteration position. Passing a per-layer data frame evaluates now.
    mean_df <- data.frame(x = current_mu[k, 1], y = current_mu[k, 2])
    p_param <- p_param +
      geom_path(data = ellipse_points, aes(x = x, y = y),
                color = cluster_color, size = 1) +
      geom_point(data = mean_df, aes(x = x, y = y),
                 color = cluster_color, size = 3)
  }
  plots_params[[i]] <- p_param
}
# Arrange panels in a grid (grid.arrange is from gridExtra, loaded at the top)
grid.arrange(grobs = plots_params, ncol = 4, top = "Parameter Evolution (Means & Covariance Ellipses)")
# --- 3D Experiment: generate data, run EM, visualize interactively --- #
# --- 3D Data Generation ---
set.seed(563)
d <- 3
K <- 2
n <- 300
data_info_3d <- generate_gaussian_mixture_data(n, d, K)
data_3d <- data_info_3d$data # (n x 3)
true_labels_3d <- data_info_3d$z
df_3d <- data.frame(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
cluster = factor(true_labels_3d))
# --- Visualization: Before EM (True Labels) ---
p_before <- plot_ly(df_3d, x = ~x, y = ~y, z = ~z, color = ~cluster,
colors = c("red", "blue"), type = "scatter3d", mode = "markers") %>%
layout(title = "Generated 3D Data (True Labels)")
p_before
# --- Run EM Algorithm (verbose: prints the log-likelihood per iteration) ---
init_3d <- initialize_values(data_3d, nrow(data_3d), d, K)
res_3d <- EM_algorithm(data_3d, init_3d$Pis, init_3d$Mus, init_3d$Sigmas, tol = 1e-4, verbose = TRUE)
# Knitted console output of the verbose run:
## Initial log-likelihood: -2573.486
## Iteration: 1 | Current log-likelihood: -2237.384
## Iteration: 2 | Current log-likelihood: -2196.989
## Iteration: 3 | Current log-likelihood: -2173.488
## Iteration: 4 | Current log-likelihood: -2146.89
## Iteration: 5 | Current log-likelihood: -2096.955
## Iteration: 6 | Current log-likelihood: -2020.971
## Iteration: 7 | Current log-likelihood: -1974.299
## Iteration: 8 | Current log-likelihood: -1968.148
## Iteration: 9 | Current log-likelihood: -1967.215
## Iteration: 10 | Current log-likelihood: -1967.092
## Iteration: 11 | Current log-likelihood: -1967.105
## Iteration: 12 | Current log-likelihood: -1967.126
## Iteration: 13 | Current log-likelihood: -1967.14
## Iteration: 14 | Current log-likelihood: -1967.147
## Iteration: 15 | Current log-likelihood: -1967.15
## Iteration: 16 | Current log-likelihood: -1967.152
## Iteration: 17 | Current log-likelihood: -1967.152
## Iteration: 18 | Current log-likelihood: -1967.153
## Iteration: 19 | Current log-likelihood: -1967.153
## Iteration: 20 | Current log-likelihood: -1967.153
# --- Final Clustering: After EM ---
# Hard-assign each point to its most probable component.
# NOTE(review): unlike the 1D/2D sections, no label-swap correction against
# the true labels is applied here, so the color mapping may be flipped.
gamma_3d <- res_3d$gamma
assigned_3d <- apply(gamma_3d, 1, which.max)
df_3d$assigned <- factor(assigned_3d)
p_after <- plot_ly(df_3d, x = ~x, y = ~y, z = ~z, color = ~assigned,
colors = c("red", "blue"), type = "scatter3d", mode = "markers") %>%
layout(title = "Final Clustering after EM (3D)")
p_after
# --- 3a) Gamma Evolution Visualization (During EM) ---
# One interactive scatter per stored iteration, colored by P(cluster 1).
gamma_plots_3d <- list()
for (i in 1:length(res_3d$gamma_history)) {
gamma_iter <- res_3d$gamma_history[[i]]
df_gamma <- data.frame(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
prob_cluster1 = gamma_iter[,1])
p_gamma <- plot_ly(df_gamma, x = ~x, y = ~y, z = ~z,
color = ~prob_cluster1, colors = "RdBu", type = "scatter3d", mode = "markers") %>%
layout(title = paste("Gamma Evolution - Iteration", i))
gamma_plots_3d[[i]] <- p_gamma
}
# Display the gamma evolution plots separately (for example, display the first one)
gamma_plots_3d[[1]]
gamma_plots_3d[[2]]
gamma_plots_3d[[3]]
# --- Helper Function: Generate Ellipsoid Points in 3D ---
ellipsoid_points <- function(mu, sigma, level = 0.95, n = 20) {
  # Points on the surface of the `level` probability ellipsoid of a
  # trivariate Gaussian N(mu, sigma).
  #
  # Args:
  #   mu:    length-3 mean vector.
  #   sigma: 3 x 3 covariance matrix.
  #   level: coverage probability of the ellipsoid.
  #   n:     grid resolution; the result has n * n points.
  #
  # Returns: data frame with columns x, y, z.
  # Mahalanobis radius for the requested coverage (chi-square, 3 df)
  radius <- sqrt(qchisq(level, df = 3))
  # Sample the unit sphere on a longitude (u) / colatitude (v) grid
  u <- seq(0, 2*pi, length.out = n)
  v <- seq(0, pi, length.out = n)
  grid <- expand.grid(u = u, v = v)
  sphere <- cbind(cos(grid$u) * sin(grid$v),
                  sin(grid$u) * sin(grid$v),
                  cos(grid$v)) * radius
  # Stretch/rotate the sphere with sigma^(1/2) obtained from the
  # eigen-decomposition, then translate to the mean
  eig <- eigen(sigma)
  shaped <- t(eig$vectors %*% diag(sqrt(eig$values)) %*% t(sphere))
  shaped <- sweep(shaped, 2, mu, "+")
  data.frame(x = shaped[,1], y = shaped[,2], z = shaped[,3])
}
# --- 3b) Parameter Evolution Visualization (During EM) ---
# For each stored EM iteration, plot the data in grey and overlay every
# component's mean (large marker) and 95% ellipsoid (semi-transparent cloud).
param_plots_3d <- list()
for (i in 1:length(res_3d$mu_history)) {
current_mu <- res_3d$mu_history[[i]]
current_sigma <- res_3d$sigma_history[[i]]
# Base plot: data in grey
p_param <- plot_ly(x = data_3d[,1], y = data_3d[,2], z = data_3d[,3],
type = "scatter3d", mode = "markers",
marker = list(color = "gray", size = 2)) %>%
layout(title = paste("Parameter Evolution - Iteration", i))
# Add each cluster's mean and ellipsoid. NOTE(review): unlike ggplot2's
# lazy aes(), these argument values appear to be captured per call — confirm
# the rendered means differ across k before relying on this.
for (k in 1:K) {
# Add mean as a marker
p_param <- p_param %>% add_markers(x = current_mu[k, 1], y = current_mu[k, 2], z = current_mu[k, 3],
marker = list(color = ifelse(k == 1, "red", "blue"), size = 6),
name = paste("Mean", k))
# Compute ellipsoid points and add as semi-transparent markers
ellip_df <- ellipsoid_points(current_mu[k, ], current_sigma[[k]], level = 0.95, n = 20)
p_param <- p_param %>% add_markers(data = ellip_df, x = ~x, y = ~y, z = ~z,
marker = list(color = ifelse(k == 1, "red", "blue"),
size = 2, opacity = 0.3),
showlegend = FALSE)
}
param_plots_3d[[i]] <- p_param
}
# Display the parameter evolution plots for selected iterations
param_plots_3d[[1]]
param_plots_3d[[5]]